############
# This file merges the Pollution and Stanford Education Data ARchive Data


########################################
setwd("Derived Data")
# Import pollution data: district by month/year pollution measures 
df_polraw=read.csv("PM25_2008_2018_District.csv", stringsAsFactors = FALSE, header=TRUE)
df_polraw$leaid=df_polraw$GEOID  # recode GEOID to be LEAID
df_polraw=df_polraw[,c("year","month","PM25","leaid","NAME")] # keep only columns of interest
#############################################

# Import SEDA FILE
df_sedaraw=fread("../Raw Data/6. SEDA DATA/seda_geodist_long_cs_4.1.csv", header=TRUE, stringsAsFactors = FALSE)  # cs scaled estimates as recommended by SEDA
# year is spring of tested year

########## 1. Cleaning:

########### POLLUTION FILE ##################
df_polraw$monthnum=match(df_polraw$month,month.name)

df_pol<- df_polraw %>% arrange(leaid, year, monthnum) %>% # order by district, then year, then month
  group_by(leaid) %>%  # want to compute average PM25 only within a district 
  mutate(avgPM25_9=zoo::rollmean(PM25,k=9,fill=NA, align='right')) %>%   # get9 month pm25 average

  
  ungroup()
rm(df_polraw)

df_pol$testmonth=df_pol$month
df_pol$testyear_cal=df_pol$year
df_pol[,c("month","year")]=NULL

fwrite(df_pol,"df_pol_diffplot.csv")

########## SEDA FILE #######################
df_seda=df_sedaraw
# Focus on: drop gap variables
cols_remove=c("totgyb_asn","cs_mnse_blk","totgyb_blk","totgyb_fem","totgyb_hsp","cs_mn_nec","cs_mnse_nec","totgyb_wht","totgyb_ecd","totgyb_nec","cs_mn_asn","cs_mn_blk","cs_mn_hsp","cs_mnse_hsp","cs_mn_mal","cs_mnse_mal","totgyb_mtr","totgyb_nam","cs_mnse_mtr","cs_mn_wht","cs_mnse_wht","cs_mn_mtr","cs_mn_ecd","cs_mnse_asn","cs_mnse_ecd","cs_mn_fem","cs_mnse_fem","totgyb_mfg","totgyb_wbg","totgyb_wmg","totgyb_wng","totgyb_neg","cs_mn_mfg","cs_mnse_mfg","cs_mn_neg","cs_mnse_neg","cs_mnse_wbg","cs_mn_whg","cs_mnse_whg","totgyb_whg","cs_mn_wng","cs_mnse_wng","cs_mn_wmg","cs_mnse_wmg","cs_mn_nam","cs_mnse_nam","cs_mn_wag","cs_mnse_wag","totgyb_wag","fips")
df_seda[,cols_remove]=NULL

df_seda$leaid=df_seda$sedalea
rm(df_sedaraw)
############## create testing months
df_seda$testmonth="May" 
# seda year: spring of testing year
# states that tested in the fall and switched in 2015 (spring of school year) :
fall2015=c("ME","MI","NH","ND","VT","WI")
df_seda$testmonth[df_seda$stateabb %in% fall2015 & df_seda$year<=2014]="October" # in 2014-2015 made the switch

# Rhode Island tested in the fall and switched in 2016 to a spring test: 
df_seda$testmonth[df_seda$stateabb=="RI" & df_seda$year<=2015]="October" # in 2015-2016 made the switch

# states that tested year round and switched in 2015:
round2015=c("DE","ID","IA")
df_seda$testmonth[df_seda$stateabb %in% round2015 & df_seda$year<=2014]="February" # in 2014-2015 made the switch
#Oregon tested year round and switched in 2014:
df_seda$testmonth[df_seda$stateabb=="OR" & df_seda$year<=2013]="February" # in 2013-2014 made the switch

# move to calendar year: if the test month was in October for the 2013-2014, then the actual year of the test is year - 1 (2013)
df_seda$testyear_cal=ifelse(df_seda$testmonth=="October",df_seda$year-1,df_seda$year)

df_seda=df_seda[!is.na(df_seda$cs_mn_all),]

#online appendix B: seda data paragraph
nrow(df_seda)
length(unique(df_seda$leaid))

######################### 2. Import demographics

# demographics: do by district grade - year : 

df_demograw=fread("../Raw Data/6. SEDA DATA/seda_cov_geodist_long_4.1.csv", header=TRUE, stringsAsFactors = FALSE)

df_demograw$leaid=df_demograw$sedalea

col_demog=c("leaid","grade","year","perhsp","perblk","perwht","perasn","pernam","perfl","perrl","perfrl","perecd","totenrl","perell","perspeced","baplusall","povertyall","unempall","single_momall","lninc50all")

# keep only certain variables

df_demograw = subset(df_demograw, select = col_demog)

# demographics
df_sedafinal=merge(df_seda,df_demograw,by=c("leaid","year","grade"))

print(nrow(df_sedafinal))
print(length(unique(df_sedafinal$leaid)))


rm(df_demograw)


################### 3. Create a district - grade - year panel: 


# go by subject 

subjects=c("rla","mth")

# create lags 
func_panel<-function(df,subjectval){
  
  # keep the subject
  df=df[df$subject==subjectval,]
  
  # create a cohort variable
  df$yearcohort=df$year-df$grade
  df$leaid_cohort=paste(df$leaid,df$yearcohort,sep="_")
  
  # create measure of male pct 
  df$malpct_test=df$totgyb_mal/(df$totgyb_all)
  
  # lagged test scores and test scores by cohort
  df=df %>% group_by(leaid_cohort) %>% mutate(cs_mn_all_lag_cohort=dplyr::lag(cs_mn_all,default=NA, order_by=year) ) %>% ungroup()                                #               
 
  return(df)
  
}    

# Reading Scores 
df_rla=func_panel(df_sedafinal,subjects[1])
# Math Scores: 
df_mth=func_panel(df_sedafinal,subjects[2]) # checked 

df_mth_rla=rbind(df_mth,df_rla)

dffinal=merge(df_mth_rla,df_pol,by=c("leaid","testyear_cal","testmonth")) # merge the pollution and the district quality 


#num obvs:
sedaobvs=paste("SEDA obvs ", nrow(df_mth_rla), " seda -pol obvs ", nrow(dffinal), "Seda unique leaid ", length(unique(df_mth_rla$leaid)), "Seda -pol unique leaid ", length(unique(dffinal$leaid)),sep="")
print(sedaobvs)

# numobvs lagged 

dflag=dffinal[!is.na(dffinal$cs_mn_all_lag_cohort),]
nrow(dflag)
length(unique(dflag$leaid))
# create total percent tested
dffinal$pcttested= dffinal$totgyb_all/ dffinal$totenrl


# create avg read and math score database: 
write.csv(dffinal,"df_avg_noinst.csv")


#### Black White gap for Table A6
df_sedaraw=fread("../Raw Data/6. SEDA DATA/seda_geodist_long_cs_4.1.csv", header=TRUE, stringsAsFactors = FALSE)  # cs scaled estimates as recommended by SEDA

# create a cohort variable
df_sedaraw$yearcohort=df_sedaraw$year-df_sedaraw$grade
df_sedaraw$leaid_cohort=paste(df_sedaraw$sedalea,df_sedaraw$yearcohort,sep="_")
# black white test score gap 
dfwbg=df_sedaraw
dfwbg=dfwbg%>% group_by(leaid_cohort,subject) %>% mutate(lagcs_mn_wbg=dplyr::lag(cs_mn_wbg,order_by=year,n=1),
                                                         lagcs_mn_wht=dplyr::lag(cs_mn_wht,order_by=year,n=1),
                                                         lagcs_mn_blk=dplyr::lag(cs_mn_blk,order_by=year,n=1))
dfwbgout=dfwbg
dfwbgout$leaid=dfwbgout$sedalea
fwrite(dfwbgout, "df_bwg.csv")



